{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoConfig, GPTJForCausalLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0cab8b76ce36446785c152d6bb13b992",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a41b0fe2bd2140c2ae9bc8c9a3a744b8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "94a339af35aa421b86f300a93f7647c1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "67bbee5d3f0a4f7e84dbd2d05f23f5bf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/1.37M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e3b2f6206d054a4c90ca390c226ffcc2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/4.04k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "29450c1c432c4ca8a42a969050d3b0aa",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/357 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-j-6B\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.pad_token_id = 0\n",
    "tokenizer.padding_side = \"left\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data\t\t\t    detect-lang.ipynb\t       translate.ipynb\r\n",
      "databricks-dolly-15k.jsonl  translated-dolly-15k.json\r\n"
     ]
    }
   ],
   "source": [
    "!ls ../dolly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('/home/husein/ssd3/dolly/translated-dolly-15k.json') as fopen:\n",
    "    dolly = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "malay_respond = [\n",
    "    ' respond in malay',\n",
    "    ' reply in malay',\n",
    "    ' reply bahasa melayu',\n",
    "    ' ckp bahasa melayu',\n",
    "    ''\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "template = {\n",
    "    \"description\": \"Template used by Alpaca-LoRA.\",\n",
    "    \"prompt_input\": \"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\\n\\n### Instruction:\\n{instruction}\\n\\n### Input:\\n{input}\\n\\n### Response:\\n\",\n",
    "    \"prompt_no_input\": \"Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\n{instruction}\\n\\n### Response:\\n\",\n",
    "    \"response_split\": \"### Response:\"    \n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Madu'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "json.loads(dolly[0]['response'][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print(dolly)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dolly = []\n",
    "for d in dolly:\n",
    "    try:\n",
    "        if random.random() > 0.5:\n",
    "            q = d['instruction'][0] + random.choice(malay_respond)\n",
    "        else:\n",
    "            q = d['instruction'][1]\n",
    "\n",
    "        res = template[\"prompt_no_input\"].format(instruction=q)\n",
    "        res = f\"{res}{d['response'][1]}\"\n",
    "        data_dolly.append(res)\n",
    "    except:\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15012"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data_dolly)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\nIn France if you were served le miel what would you eat respond in malay\\n\\n### Response:\\n\"Madu\"',\n",
       " 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\n\"Apakah makna kehidupan?\"\\n\\n### Response:\\n\"Saya tidak, pergilah baca beberapa buku.\"',\n",
       " 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\nFrank Sinatra changed the lyrics of \"The Lady is a Tramp\" to \"The Lady is a\" what? respond in malay\\n\\n### Response:\\n\"Juara\".',\n",
       " 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\nDi dalam buku manakah anda akan menjumpai pembantu lelaki Pas Partout?\\n\\n### Response:\\n\"Keliling dunia dalam 80 hari\"',\n",
       " 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\n\"Dalam siri A Song of Ice and Fire, siapakah pengasas House Thenn?\"\\n\\n### Response:\\n\\'Sigorn, anak Styr\\'',\n",
       " 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\n\"Bilakah episod pertama Big Brother ditayangkan di CBS?\"\\n\\n### Response:\\nEpisod pertama Big Brother ditayangkan di CBS pada bulan Julai 2000.',\n",
       " 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\nWhich Baseball team won the 2016 World Series? reply in malay\\n\\n### Response:\\nChicago Cubs memenangi Siri Dunia Bola Sepak pada tahun 2016.',\n",
       " 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\n\"Apakah beberapa kumpulan rock progresif yang hebat?\"\\n\\n### Response:\\nRUSH, Ya, Genesis, Projek Alan Parsons, Pink Floyd, King Crimson, Emerson Lake dan Palmer, Jethro Tull.',\n",
       " 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\n\"Apakah bukti pertama tentang boomerang?\"\\n\\n### Response:\\nBukti awal tentang bumerang di Australia adalah kira-kira 20,000 tahun yang lalu.',\n",
       " 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\nKenal pasti haiwan yang dijinakkan atau liar: Kupu-kupu sutera, siput Roman.\\n\\n### Response:\\n\"Silkworm adalah ternakan, siput Roman adalah liar.\"']"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_dolly[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('prepared-dolly.json', 'w') as fopen:\n",
    "    json.dump(data_dolly, fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
